Data Science Project (England Weather)

Import libraries
In [1]:
# Numerical and tabular tooling.
import numpy as np
import pandas as pd
# Static plotting (matplotlib + seaborn).
import matplotlib.pyplot as plt
import seaborn as sns
# Select seaborn's "darkgrid" theme for the matplotlib/seaborn figures below.
sns.set(style="darkgrid")
# Interactive plotting (plotly).
import plotly.graph_objects as go
import plotly.express as px
Import Dataset
In [2]:
# Read the weather observations from a CSV file; the relative path is resolved
# against the current working directory. The bare name on the last line makes
# the loaded frame the cell's displayed output.
data = pd.read_csv('EnglandWeather.csv')
data
Out[2]:
Formatted Date Summary Precip Type Temperature (C) Wind Speed (km/h) Pressure (millibars) Humidity
0 2006-04-01 00:00:00.000 +0200 Partly Cloudy rain 9.472222 14.1197 1015.13 0.89
1 2006-04-01 01:00:00.000 +0200 Partly Cloudy rain 9.355556 14.2646 1015.63 0.86
2 2006-04-01 02:00:00.000 +0200 Mostly Cloudy rain 9.377778 3.9284 1015.94 0.89
3 2006-04-01 03:00:00.000 +0200 Partly Cloudy rain 8.288889 14.1036 1016.41 0.83
4 2006-04-01 04:00:00.000 +0200 Mostly Cloudy rain 8.755556 11.0446 1016.51 0.83
... ... ... ... ... ... ... ...
96448 2016-09-09 19:00:00.000 +0200 Partly Cloudy rain 26.016667 10.9963 1014.36 0.43
96449 2016-09-09 20:00:00.000 +0200 Partly Cloudy rain 24.583333 10.0947 1015.16 0.48
96450 2016-09-09 21:00:00.000 +0200 Partly Cloudy rain 22.038889 8.9838 1015.66 0.56
96451 2016-09-09 22:00:00.000 +0200 Partly Cloudy rain 21.522222 10.5294 1015.95 0.60
96452 2016-09-09 23:00:00.000 +0200 Partly Cloudy rain 20.438889 5.8765 1016.16 0.61

96453 rows × 7 columns

In [3]:
# Working frame used by every later cell.
# NOTE(review): `data` is already a DataFrame (it comes from pd.read_csv), so this
# only re-wraps it. Whether the result shares memory with `data` depends on the
# pandas version/copy settings — use `data.copy()` if an independent frame is
# required; TODO confirm intent.
df = pd.DataFrame(data)
df
Out[3]:
Formatted Date Summary Precip Type Temperature (C) Wind Speed (km/h) Pressure (millibars) Humidity
0 2006-04-01 00:00:00.000 +0200 Partly Cloudy rain 9.472222 14.1197 1015.13 0.89
1 2006-04-01 01:00:00.000 +0200 Partly Cloudy rain 9.355556 14.2646 1015.63 0.86
2 2006-04-01 02:00:00.000 +0200 Mostly Cloudy rain 9.377778 3.9284 1015.94 0.89
3 2006-04-01 03:00:00.000 +0200 Partly Cloudy rain 8.288889 14.1036 1016.41 0.83
4 2006-04-01 04:00:00.000 +0200 Mostly Cloudy rain 8.755556 11.0446 1016.51 0.83
... ... ... ... ... ... ... ...
96448 2016-09-09 19:00:00.000 +0200 Partly Cloudy rain 26.016667 10.9963 1014.36 0.43
96449 2016-09-09 20:00:00.000 +0200 Partly Cloudy rain 24.583333 10.0947 1015.16 0.48
96450 2016-09-09 21:00:00.000 +0200 Partly Cloudy rain 22.038889 8.9838 1015.66 0.56
96451 2016-09-09 22:00:00.000 +0200 Partly Cloudy rain 21.522222 10.5294 1015.95 0.60
96452 2016-09-09 23:00:00.000 +0200 Partly Cloudy rain 20.438889 5.8765 1016.16 0.61

96453 rows × 7 columns

Preprocessing
In [4]:
# (rows, columns) of the frame before any cleaning.
df.shape
Out[4]:
(96453, 7)
In [5]:
# Per-column dtype and non-null count; a count below the row total marks a
# column with missing values.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96453 entries, 0 to 96452
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Formatted Date        96453 non-null  object 
 1   Summary               96453 non-null  object 
 2   Precip Type           95936 non-null  object 
 3   Temperature (C)       96453 non-null  float64
 4   Wind Speed (km/h)     96453 non-null  float64
 5   Pressure (millibars)  96453 non-null  float64
 6   Humidity              96453 non-null  float64
dtypes: float64(4), object(3)
memory usage: 5.2+ MB
In [6]:
# Summary statistics for every column; include='all' also reports
# count/unique/top/freq for the non-numeric columns.
df.describe(include='all')
Out[6]:
Formatted Date Summary Precip Type Temperature (C) Wind Speed (km/h) Pressure (millibars) Humidity
count 96453 96453 95936 96453.000000 96453.000000 96453.000000 96453.000000
unique 96429 27 2 NaN NaN NaN NaN
top 2010-08-02 00:00:00.000 +0200 Partly Cloudy rain NaN NaN NaN NaN
freq 2 31733 85224 NaN NaN NaN NaN
mean NaN NaN NaN 11.932678 10.810640 1003.235956 0.734899
std NaN NaN NaN 9.551546 6.913571 116.969906 0.195473
min NaN NaN NaN -21.822222 0.000000 0.000000 0.000000
25% NaN NaN NaN 4.688889 5.828200 1011.900000 0.600000
50% NaN NaN NaN 12.000000 9.965900 1016.450000 0.780000
75% NaN NaN NaN 18.838889 14.135800 1021.090000 0.890000
max NaN NaN NaN 39.905556 63.852600 1046.380000 1.000000
In [7]:
# Number of missing values in each column.
df.isnull().sum()
Out[7]:
Formatted Date            0
Summary                   0
Precip Type             517
Temperature (C)           0
Wind Speed (km/h)         0
Pressure (millibars)      0
Humidity                  0
dtype: int64
In [8]:
# Discard every row that contains at least one missing value and rebind `df`
# to the cleaned frame (the pre-cleaning frame is no longer reachable via `df`).
df = df.dropna()
In [9]:
# Re-check (rows, columns) to see how many rows the dropna step removed.
df.shape
Out[9]:
(95936, 7)
Visualization
In [10]:
# Calendar year of each observation, read from the local wall-clock part of
# 'Formatted Date' (its first 19 characters, "YYYY-MM-DD HH:MM:SS").
# The trailing UTC offset is deliberately left out: converting to UTC before
# taking the year shifts readings taken just after local midnight on 1 January
# (e.g. "2006-01-01 00:00:00.000 +0100" -> 2005-12-31 23:00 UTC) into the
# previous year, which distorts the per-year counts plotted below.
year = pd.to_datetime(
    df['Formatted Date'].str.slice(0, 19),
    format='%Y-%m-%d %H:%M:%S',
).dt.year
year
Out[10]:
0        2006
1        2006
2        2006
3        2006
4        2006
         ... 
96448    2016
96449    2016
96450    2016
96451    2016
96452    2016
Name: Formatted Date, Length: 95936, dtype: int32
In [11]:
# Bar chart of the number of rows recorded in each year.
plt.title("Num of Rows per Year")
sns.countplot(x=year)
# Ask for the grid explicitly: a bare plt.grid() merely toggles whatever grid
# state the axes currently have, so it can switch the grid off instead of on.
plt.grid(True)
#plt.savefig('Num of Rows per Year.jpeg')
In [12]:
# Attach the per-row year as a 'Year' column; a new column is appended after the
# existing ones.
# Plain assignment is used rather than df.insert(7, ...): insert raises
# ValueError when 'Year' already exists, so the cell could not be re-run, and
# the hard-coded position 7 is only valid while the frame has exactly seven
# columns.
df["Year"] = year
df
Out[12]:
Formatted Date Summary Precip Type Temperature (C) Wind Speed (km/h) Pressure (millibars) Humidity Year
0 2006-04-01 00:00:00.000 +0200 Partly Cloudy rain 9.472222 14.1197 1015.13 0.89 2006
1 2006-04-01 01:00:00.000 +0200 Partly Cloudy rain 9.355556 14.2646 1015.63 0.86 2006
2 2006-04-01 02:00:00.000 +0200 Mostly Cloudy rain 9.377778 3.9284 1015.94 0.89 2006
3 2006-04-01 03:00:00.000 +0200 Partly Cloudy rain 8.288889 14.1036 1016.41 0.83 2006
4 2006-04-01 04:00:00.000 +0200 Mostly Cloudy rain 8.755556 11.0446 1016.51 0.83 2006
... ... ... ... ... ... ... ... ...
96448 2016-09-09 19:00:00.000 +0200 Partly Cloudy rain 26.016667 10.9963 1014.36 0.43 2016
96449 2016-09-09 20:00:00.000 +0200 Partly Cloudy rain 24.583333 10.0947 1015.16 0.48 2016
96450 2016-09-09 21:00:00.000 +0200 Partly Cloudy rain 22.038889 8.9838 1015.66 0.56 2016
96451 2016-09-09 22:00:00.000 +0200 Partly Cloudy rain 21.522222 10.5294 1015.95 0.60 2016
96452 2016-09-09 23:00:00.000 +0200 Partly Cloudy rain 20.438889 5.8765 1016.16 0.61 2016

95936 rows × 8 columns

In [13]:
# Rows per year as a two-column table (Year, Count) for plotly.
# Kept under its own name rather than `data`, so the raw frame loaded by
# pd.read_csv stays available and the `df = pd.DataFrame(data)` cell can be
# re-run without rebuilding `df` from these counts.
year_counts = df['Year'].value_counts().reset_index()
year_counts.columns = ['Year', 'Count']

# Donut chart (hole=0.2) showing each year's share of the rows.
fig = px.pie(year_counts, values='Count', names='Year', hole=0.2)
fig.update_layout(title_text="Distribution of Years", title_x=0.5)

fig.show()
In [14]:
# Overview histograms of the frame's columns, 100 bins each, on one large canvas.
df.hist(figsize=(24, 20), bins=100)
plt.show()
In [15]:
# Column labels from position 3 onward, i.e. everything after the first three
# columns of the frame; these are the columns plotted in the loops below.
features = df.columns[3:]
features
Out[15]:
Index(['Temperature (C)', 'Wind Speed (km/h)', 'Pressure (millibars)',
       'Humidity', 'Year'],
      dtype='object')
In [16]:
# Density-scaled histogram with a KDE curve for each feature, on a 3x2 grid.
hist_fig = plt.figure(figsize=(22, 20))
for slot, feature in enumerate(features, start=1):
    # Subplot slots are 1-based, hence enumerate(..., start=1).
    hist_ax = hist_fig.add_subplot(3, 2, slot)
    sns.histplot(df[feature], stat="density", kde=True, ax=hist_ax)
    hist_ax.set_title(f"Distribution of {feature}")
hist_fig.tight_layout()
plt.show()
In [17]:
# One horizontal box plot per feature on a 3x2 grid, titled with the column name.
box_fig = plt.figure(figsize=(24, 20))
for slot, feature in enumerate(features, start=1):
    # Subplot slots are 1-based, hence enumerate(..., start=1).
    box_ax = box_fig.add_subplot(3, 2, slot)
    sns.boxplot(data=df, x=feature, ax=box_ax)
    box_ax.set_title(f"{feature}")
box_fig.tight_layout()
plt.show()
In [18]:
# Humidity against temperature, one small marker per observation.
plt.figure(figsize=(10, 6), dpi=80)

plt.scatter(df['Temperature (C)'], df['Humidity'], color='#17becf', s=10)
plt.title("Humidity & Temperature (C)", backgroundcolor='#D4D587', color='white', fontsize=23)
plt.xlabel('Temperature (C)', fontsize=18)
plt.ylabel('Humidity', fontsize=18)
# Request the grid explicitly: a bare plt.grid() toggles the current state, and
# with the seaborn "darkgrid" theme selected in the import cell the grid starts
# out visible, so the toggle would hide it.
plt.grid(True)
#plt.savefig('Humidity & Temperature (C).jpeg')
plt.show()
In [19]:
# Humidity against wind speed, one small marker per observation.
plt.figure(figsize=(10, 6), dpi=80)

plt.scatter(df['Wind Speed (km/h)'], df['Humidity'], color='#17becf', s=10)
plt.title("Humidity & Wind Speed (km/h)", backgroundcolor='#D4D587', color='white', fontsize=23)
plt.xlabel('Wind Speed (km/h)', fontsize=18)
plt.ylabel('Humidity', fontsize=18)
# Request the grid explicitly: a bare plt.grid() toggles the current state, and
# with the seaborn "darkgrid" theme selected in the import cell the grid starts
# out visible, so the toggle would hide it.
plt.grid(True)
#plt.savefig('Humidity & Wind Speed.jpeg')
plt.show()
In [20]:
# Humidity against atmospheric pressure.
# NOTE(review): describe() reports a minimum pressure of 0 millibars; such rows
# are presumably missing-value placeholders and stretch the x-axis — consider
# filtering them before plotting (TODO confirm).
plt.figure(figsize=(10, 6), dpi=80)

plt.scatter(df['Pressure (millibars)'], df['Humidity'], color='#17becf')
plt.title("Humidity & Pressure (millibars)", backgroundcolor='#D4D587', color='white', fontsize=23)
plt.xlabel('Pressure (millibars)', fontsize=18)
plt.ylabel('Humidity', fontsize=18)
# Request the grid explicitly: a bare plt.grid() toggles the current state, and
# with the seaborn "darkgrid" theme selected in the import cell the grid starts
# out visible, so the toggle would hide it.
plt.grid(True)
#plt.savefig('Humidity & Pressure (millibars).jpeg')
plt.show()
In [21]:
# Humidity for each weather-summary category (categorical x-axis).
plt.figure(figsize=(10, 6), dpi=80)

plt.scatter(df['Summary'], df['Humidity'], color='#17becf')
plt.title("Humidity & Summary of weather", backgroundcolor='#D4D587', color='white', fontsize=23)
plt.xlabel('Summary', fontsize=18)
plt.ylabel('Humidity', fontsize=18)
# Request the grid explicitly: a bare plt.grid() toggles the current state, and
# with the seaborn "darkgrid" theme selected in the import cell the grid starts
# out visible, so the toggle would hide it.
plt.grid(True)
# Category names are long, so the tick labels are turned vertical.
plt.xticks(rotation=90)
# bbox_inches='tight' grows the saved canvas to fit the vertical tick labels,
# which would otherwise be cut off at the bottom edge of the image.
# Must run before plt.show(), which releases the figure.
plt.savefig('Humidity & Summary of weather.png', bbox_inches='tight')
plt.show()
In [22]:
# Humidity for each precipitation type (categorical x-axis).
plt.figure(figsize=(10, 6), dpi=80)

plt.scatter(df['Precip Type'], df['Humidity'], color='#17becf')
plt.title("Humidity & Precip Type", backgroundcolor='#D4D587', color='white', fontsize=23)
plt.xlabel('Precip Type', fontsize=18)
plt.ylabel('Humidity', fontsize=18)
# Request the grid explicitly: a bare plt.grid() toggles the current state, and
# with the seaborn "darkgrid" theme selected in the import cell the grid starts
# out visible, so the toggle would hide it.
plt.grid(True)
plt.xticks(rotation=90)
#plt.savefig('Humidity & Precip Type.jpeg')
plt.show()
In [23]:
# Humidity against temperature and against wind speed, overlaid on one set of
# axes; the two series share the x-axis, so it carries both quantities.
plt.figure(figsize=(10, 6), dpi=80)

# Each series carries its own label so the legend cannot drift out of step with
# the plotting order.
plt.scatter(df['Temperature (C)'], df['Humidity'], color='#17becf', label='Temperature (C)')
plt.scatter(df['Wind Speed (km/h)'], df['Humidity'], color='#DE2222', label='Wind Speed (km/h)')

plt.title('Temperature (C) & Wind Speed (km/h)', backgroundcolor='#D4D587', color='white', fontsize=23)
# The x-axis holds both plotted quantities, so the label names both.
plt.xlabel('Temperature (C) / Wind Speed (km/h)', fontsize=18)
plt.ylabel('Humidity', fontsize=18)
plt.legend()
# Request the grid explicitly: a bare plt.grid() toggles the current state, and
# with the seaborn "darkgrid" theme selected in the import cell the grid starts
# out visible, so the toggle would hide it.
plt.grid(True)
#plt.savefig('Temperature & Wind Speed.jpeg')
plt.show()